cmei24_OriginalHomeworkCode_03

1 Some of my best friends are Zombies…

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(curl)
## Using libcurl 8.7.1 with LibreSSL/3.3.6
## 
## Attaching package: 'curl'
## 
## The following object is masked from 'package:readr':
## 
##     parse_date

Load the zombie dataset from GitHub

zombie_data <- curl(url = 'https://raw.githubusercontent.com/fuzzyatelin/fuzzyatelin.github.io/refs/heads/master/AN588_Spring25/zombies.csv')

zombie_data <- read_csv(zombie_data, col_names = TRUE) # We want to keep the same name of columns from the original database
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): first_name, last_name, gender, major
## dbl (6): id, height, weight, zombies_killed, years_of_education, age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(zombie_data) # To see the variable class each column will be
## spc_tbl_ [1,000 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id                : num [1:1000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ first_name        : chr [1:1000] "Sarah" "Mark" "Brandon" "Roger" ...
##  $ last_name         : chr [1:1000] "Little" "Duncan" "Perez" "Coleman" ...
##  $ gender            : chr [1:1000] "Female" "Male" "Male" "Male" ...
##  $ height            : num [1:1000] 62.9 67.8 72.1 66.8 64.7 ...
##  $ weight            : num [1:1000] 132 146 153 130 132 ...
##  $ zombies_killed    : num [1:1000] 2 5 1 5 4 1 0 4 9 2 ...
##  $ years_of_education: num [1:1000] 1 3 1 6 3 4 4 0 3 3 ...
##  $ major             : chr [1:1000] "medicine/nursing" "criminal justice administration" "education" "energy studies" ...
##  $ age               : num [1:1000] 17.6 22.6 21.9 18.2 21.1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   first_name = col_character(),
##   ..   last_name = col_character(),
##   ..   gender = col_character(),
##   ..   height = col_double(),
##   ..   weight = col_double(),
##   ..   zombies_killed = col_double(),
##   ..   years_of_education = col_double(),
##   ..   major = col_character(),
##   ..   age = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Let’s take a look at how the raw data looks!

head(zombie_data)
## # A tibble: 6 × 10
##      id first_name last_name gender height weight zombies_killed
##   <dbl> <chr>      <chr>     <chr>   <dbl>  <dbl>          <dbl>
## 1     1 Sarah      Little    Female   62.9   132.              2
## 2     2 Mark       Duncan    Male     67.8   146.              5
## 3     3 Brandon    Perez     Male     72.1   153.              1
## 4     4 Roger      Coleman   Male     66.8   130.              5
## 5     5 Tammy      Powell    Female   64.7   132.              4
## 6     6 Anthony    Green     Male     71.2   153.              1
## # ℹ 3 more variables: years_of_education <dbl>, major <chr>, age <dbl>

1.1 Calculate the population mean and standard deviation for each quantitative random variable

Let’s load the population variance and standard deviation that we got from class!

# Population variance function
pop_v <- function(x) {
    sum((x - mean(x))^2)/(length(x))
}

# Population sd function (square root of variance)
pop_sd <- function(x) {
    sqrt(pop_v(x))
}

Use for loop to pick which columns we want to use and apply the functions to get the mean, standard deviation!

for (column in colnames(zombie_data)){
  col_values <- zombie_data[[column]] # extract the data from each column picked
  if (column != "id" && is.numeric(col_values)){ # We only want values that are numeric but we also don't want to include ID
    average <- mean(col_values)
  variance <- pop_v(col_values)
  s_dev <- pop_sd(col_values)
  print(paste(column, " has a mean of ", average, " a population variance of ", variance, " and a standard deviation of ", s_dev))
  print("------------------------------------------------------------------------------")
  }
}
## [1] "height  has a mean of  67.63009808295  a population variance of  18.5586064409787  and a standard deviation of  4.30797010678797"
## [1] "------------------------------------------------------------------------------"
## [1] "weight  has a mean of  143.90747819936  a population variance of  338.260407098612  and a standard deviation of  18.3918570867276"
## [1] "------------------------------------------------------------------------------"
## [1] "zombies_killed  has a mean of  2.992  a population variance of  3.053936  and a standard deviation of  1.74755142985836"
## [1] "------------------------------------------------------------------------------"
## [1] "years_of_education  has a mean of  2.996  a population variance of  2.807984  and a standard deviation of  1.67570403114631"
## [1] "------------------------------------------------------------------------------"
## [1] "age  has a mean of  20.04696112212  a population variance of  8.78282213384171  and a standard deviation of  2.96358265176487"
## [1] "------------------------------------------------------------------------------"

1.2 Use {ggplot} to make boxplots of each of these variables by gender.

for (column in colnames(zombie_data)){
  col_values <- zombie_data[[column]]
  if (column != "id" && is.numeric(col_values)){ # Again, we only want values that are numeric but we also don't want to include ID
    print(
      ggplot(zombie_data, aes(x = as.factor(gender), y = col_values, fill = as.factor(gender))) + # fill = gender will help us assign different colors according to the gender factors
  geom_boxplot() + 
  labs(y = column, x = "Gender", fill = "Gender")
    ) +
      theme_classic()
  }
}

1.4 Using histograms and Q-Q plots, check whether the quantitative variables seem to be drawn from a normal distribution. Which seem to be and which do not

par(mfrow = c(2, 3)) # Using this to show multiple figures c(rows, columns)

for (column in colnames(zombie_data)){ # Let's re-use the for loop used above!
  col_values <- zombie_data[[column]]
  if (column != "id" && is.numeric(col_values)){
    qqnorm(col_values, main = paste("Normal QQ plot ", column))
    qqline(col_values, col = "red") # To make the Q-Q plot more readable, we'll add a qqline.
  }
}

par(mfrow = c(1, 1)) # Reset it so the next figures don't have the same configuration

For all numerical columns except for zombies_killed and years_of_education, we can see that most plots lie on the Q-Q line. This means that they are normally distributed. The other two columns look strange! Let’s use a histogram to see what is up with them!

# Use for loop again and use ggplot to create a histogram to display value distribution

for (column in colnames(zombie_data)){
  col_values <- zombie_data[[column]]
  if (column != "id" && is.numeric(col_values)){
    print(
      ggplot(zombie_data, aes(x = col_values)) +
        geom_histogram(color = "black", fill = "lightblue") + 
        labs(y = "Frequency", x = column)) + 
        theme_minimal()
  }
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The gaps that we see in zombies_killed and years_of_education are due to the minimum bins used by ggplot, for subsequent histograms, we can decrease this to only 10 bins. Still the distribution appears to skew to the left. Additionally, we know that zombies_killed and years_of_education are discrete values and not continuous. Count data like this and knowing that the distribution is skewed to the left, tells me that they could belong to a Poisson distribution. Let’s test this out!

# First create a random set of values that belong to a Poisson distribution
test_poisson_kills <- rpois(n = length(zombie_data$zombies_killed), lambda = mean(zombie_data$zombies_killed))
# The parameters are: same number of samples in our original dataset, and a lambda based on the mean of our original column value

par(mfrow = c(1,2)) # Create layout that has 1 row with 2 columns

#Plot histogram of original distribution
hist(zombie_data$zombies_killed, breaks = 10, col = "lightblue", main = "Observed Zombie Kills", xlab = "Zombie Kills")

# Plot histogram of Poisson proxy distribution
hist(test_poisson_kills, breaks = 10, col = "pink", main = "Poisson Fit Zombie Kills", xlab = " Predicted Zombie Kills")

Having them side by side tells us that zombie_kills is indeed following a Poisson distribution!

# We'll do the same for years_of_education!
test_poisson_education <- rpois(n = length(zombie_data$years_of_education), lambda = mean(zombie_data$years_of_education))

par(mfrow = c(1,2))

#Plot histogram of original distribution
hist(zombie_data$years_of_education, breaks = 10, col = "lightblue", main = "Observed Education Years", xlab = "Observed Years of Education")

# Plot histogram of Poisson proxy distribution
hist(test_poisson_education, breaks = 10, col = "pink", main = "Poisson Fit Education Years", xlab = " Predicted Years of Education")

par(mfrow = c(1,1)) # Return to normal paneling = 1 row, 1 column

1.5 Now use the sample() function to sample ONE subset of 30 zombie survivors (without replacement) from this population and calculate the mean and sample standard deviation for each variable.

set.seed(1)
sample_rows <- sample(nrow(zombie_data), 30) # First get a random index of the rows available
zombie_data_sample <- zombie_data[sample_rows,] # Use the sample_row as an index to get all the rows associated  with the number in the original zombie_data
zombie_data_sample
## # A tibble: 30 × 10
##       id first_name  last_name gender height weight zombies_killed
##    <dbl> <chr>       <chr>     <chr>   <dbl>  <dbl>          <dbl>
##  1   836 Susan       Hawkins   Female   69.4   133.              3
##  2   679 Benjamin    Ward      Male     62.0   140.              2
##  3   129 Martin      Wagner    Male     70.3   155.              2
##  4   930 Keith       Wagner    Male     63.2   133.              3
##  5   509 Doris       Lane      Female   62.3   134.              3
##  6   471 Jennifer    Williams  Female   62.3   145.              2
##  7   299 Kelly       Burke     Female   69.9   146.              5
##  8   270 Christopher Lee       Male     67.6   131.              2
##  9   978 Willie      Richards  Male     64.7   133.              5
## 10   187 Ann         Reid      Female   71.0   142.              2
## # ℹ 20 more rows
## # ℹ 3 more variables: years_of_education <dbl>, major <chr>, age <dbl>

1.6 Estimate the standard error for each variable, and construct the 95% confidence interval for each mean.

Let’s load the necessary functions to calculate the Standard Error, and confidence intervals. Since we found out that zombies_killed and years_of_education follow an Poisson distribution, we need to calculate a confidence interval based on this distribution.

# Standard Error
SE <- function(x) {
    sqrt(var(x)/length(x))
}

# Confidnce Interval based on Normal/Gaussian distribution
normalCI = function(x, CIlevel = 0.95) {
    upper = mean(x) + qnorm(1 - (1 - CIlevel)/2) * sqrt(var(x)/length(x))
    lower = mean(x) + qnorm((1 - CIlevel)/2) * sqrt(var(x)/length(x))
    ci <- c(lower, upper)
    return(ci)
}

# Poisson Confidence Interval (had to google this)

poisson_CI <- function(counts, conf_level = 0.95) {
  lambda_hat <- mean(counts)  # Estimate of lambda
  alpha <- 1 - conf_level
  
  # Compute confidence interval using Chi-Square
  lower_CI <- qchisq(alpha / 2, 2 * lambda_hat) / 2
  upper_CI <- qchisq(1 - alpha / 2, 2 * (lambda_hat + 1)) / 2
  
  return(c(lower_CI, upper_CI))
}
for (column in colnames(zombie_data_sample)){
  col_values <- zombie_data_sample[[column]] # extract the data from each column picked
  if (column != "id" && is.numeric(col_values)){ # We only want values that are numeric but we also don't want to include ID
    
    average <- mean(col_values)
    variance <- var(col_values) # we can use var() now because its a sample
    s_dev <- sd(col_values) # we can use sd() now because its a sample
    s_error <- SE(col_values)
    
    print(paste(column, " has a mean of ", average, " a population variance of ", variance, " a standard deviation of ", s_dev, " and a standard error of ", s_error))
    
    # When it comes to CI, we want to pick which CI function to use according to column identity
    # If columns are normally distributed:
    if(column != 'zombies_killed' && column != 'years_of_education'){
      confidence_interval <- normalCI(col_values)
      print(paste("Lower lvl confidence intervals for", column, "is:", confidence_interval[1]))
      print(paste("Upper lvl confidence intervals for", column, "is:", confidence_interval[2]))
    }
    
    # If columns are Poisson distributed:
    if (column == 'zombies_killed' | column == 'years of education'){
      confidence_interval <- poisson_CI(col_values)
      print(paste("Lower lvl confidence intervals for", column, "is:", confidence_interval[1]))
      print(paste("Upper lvl confidence intervals for", column, "is:", confidence_interval[2]))
    }
    
    print("------------------------------------------------------------------------------")
  }
}
## [1] "height  has a mean of  66.8852164586667  a population variance of  13.9505038630675  a standard deviation of  3.73503733088003  and a standard error of  0.681921399748961"
## [1] "Lower lvl confidence intervals for height is: 65.5486750748716"
## [1] "Upper lvl confidence intervals for height is: 68.2217578424618"
## [1] "------------------------------------------------------------------------------"
## [1] "weight  has a mean of  142.253628986667  a population variance of  263.301924599511  a standard deviation of  16.2265808043318  and a standard error of  2.96255477923763"
## [1] "Lower lvl confidence intervals for weight is: 136.447128317134"
## [1] "Upper lvl confidence intervals for weight is: 148.060129656199"
## [1] "------------------------------------------------------------------------------"
## [1] "zombies_killed  has a mean of  3.06666666666667  a population variance of  2.82298850574713  a standard deviation of  1.68017514139066  and a standard error of  0.306756608499699"
## [1] "Lower lvl confidence intervals for zombies_killed is: 0.647615208957731"
## [1] "Upper lvl confidence intervals for zombies_killed is: 8.86738521470661"
## [1] "------------------------------------------------------------------------------"
## [1] "years_of_education  has a mean of  2.96666666666667  a population variance of  2.86091954022989  a standard deviation of  1.69142529844799  and a standard error of  0.308810596764958"
## [1] "------------------------------------------------------------------------------"
## [1] "age  has a mean of  19.68656653  a population variance of  8.76441860139941  a standard deviation of  2.96047607681592  and a standard error of  0.540506509408826"
## [1] "Lower lvl confidence intervals for age is: 18.6271932381492"
## [1] "Upper lvl confidence intervals for age is: 20.7459398218508"
## [1] "------------------------------------------------------------------------------"

1.7 Now draw 99 more random samples of 30 zombie apocalypse survivors, and calculate the mean for each variable for each of these samples.

First create function that will extract all the necessary information:

get_means <- function(sample_data){
  average <- NULL
  sample_avg_columns <- NULL
  for (column in colnames(sample_data)){
  col_values <- sample_data[[column]] # extract the data from each column picked
  if (column != "id" && is.numeric(col_values)){ # We only want values that are numeric but we also don't want to include ID
    average <- c(average, mean(col_values)) # Append the new mean values to the list
    sample_avg_columns <- c(sample_avg_columns, column) # Create a list of the column names
  }
  }
  means_table <- data.frame( # Using the data above, we can create a new data frame
    column_name = sample_avg_columns, # Where column names are the ones we collected above
    mean_value = average, # Where values are the ones collected before
    stringsAsFactors = FALSE
    
    # This table is in long format meaning that the final column names that we want are actually the values in the column_name column
  )
  
  # To visualize this better, we turn these values into columns using the pivot_wider() function in dplyr
  means_table <- means_table %>% pivot_wider(
    names_from = column_name, # We create new columns based on the values under column_name
    values_from = mean_value # We assign to these new columns their respective values from the mean_value column
  )
}

Now that we have the function created let’s apply that to all 99 sampling events!

set.seed(1)
final_table <- get_means(zombie_data_sample) # We start our final table with the first sampling event we had!
for (i in 1:99){ # We create a loop for sampling 99 times
  sample_rows <- zombie_data[sample(nrow(zombie_data), 30, replace = TRUE),] # We simplify the sampling step we did for our initial sampling into a single line
  sampled_means <- get_means(sample_rows) # Apply the get_means() function to this new sample
  final_table <- rbind(final_table, sampled_means) # Append the means onto the final table
}

head(final_table)
## # A tibble: 6 × 5
##   height weight zombies_killed years_of_education   age
##    <dbl>  <dbl>          <dbl>              <dbl> <dbl>
## 1   66.9   142.           3.07               2.97  19.7
## 2   66.9   142.           3.03               3.03  19.8
## 3   68.8   149.           3.07               3.53  20.8
## 4   66.9   143.           2.8                3.07  19.3
## 5   67.7   142.           3.53               2.47  20.2
## 6   66.3   139.           2.97               3     18.7

1.8 What are the means and standard deviations of this distribution of means for each variable?

To calculate means:

means_of_means <- final_table %>% 
  summarise( # Use summarise to create a separate table for easier visualization
    mean_heights = mean(height),
    mean_weight = mean(weight),
    mean_kills = mean(zombies_killed),
    mean_education = mean(years_of_education),
    mean_age = mean(age)
  )

means_of_means
## # A tibble: 1 × 5
##   mean_heights mean_weight mean_kills mean_education mean_age
##          <dbl>       <dbl>      <dbl>          <dbl>    <dbl>
## 1         67.6        144.       2.95           2.94     20.0

1.9 How do the standard deviations of means compare to the standard errors estimated in [5]?

To calculate standard deviation:

sd_of_means <- final_table %>%
  summarise( # Use summarise to create a separate table for easier visualization
    sd_heights = sd(height), # Assuming these are samples and not a population!
    sd_weight = sd(weight),
    sd_kills = sd(zombies_killed),
    sd_education = sd(years_of_education),
    sd_age = sd(age)
  )

sd_of_means
## # A tibble: 1 × 5
##   sd_heights sd_weight sd_kills sd_education sd_age
##        <dbl>     <dbl>    <dbl>        <dbl>  <dbl>
## 1      0.754      3.29    0.355        0.322  0.533

These standard deviations are closer to the SE calculated before

1.10 What do these sampling distributions look like (a graph might help here)? Are they normally distributed? What about for those variables that you concluded were not originally drawn from a normal distribution?

# Final Q-Q plot check 
par(mfrow = c(2, 3)) # Using this to show multiple figures c(rows, columns)
for (column in colnames(final_table)){
  col_values <- final_table[[column]]
  qqnorm(col_values, main = paste("Normal QQ plot ", column))
  qqline(col_values, col = "red")
}

par(mfrow = c(1, 1)) # Reset it so the next figures don't have the same configuration

# Let's run the same for loop code used for histogram creation but this time on the means table (final_table)

for (column in colnames(final_table)){
  col_values <- final_table[[column]]
  if (column != "id" && is.numeric(col_values)){
    print(
      ggplot(final_table, aes(x = col_values)) +
        geom_histogram(color = "black", fill = "lightblue") + 
        labs(y = "Frequency", x = column)) + 
        theme_minimal()
  }
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

All columns look more normally distributed now! This means corroborates the Central Limit Theorem: sampling distribution of the mean will always be normally distributed, as long as the sample size is large enough